# Start from an empty workspace, hidden (dot-prefixed) objects included.
rm(list = ls(all.names = TRUE))
#library(quanteda)
#library(ggplot2)

# DATA AND PATHS
# --------------
# The two .RData files are restored into the workspace. The rest of the
# script uses the objects `data` and `wfm.list` without defining them, so
# they are expected to come from these files.
load("./generated_data/1-corpus_and_wfm.RData")
load("./generated_data/3-budget_data_with_estimated_positions.RData")

# Restrict the sample to ministers who took part in the budget debate,
# i.e. rows for which an estimated policy position (ws.cab.rescaled) exists.
data <- data[!is.na(data$ws.cab.rescaled), ]

# Speakers from the Taoiseach and Finance departments are kept only when
# they are the PM or the FM. Equivalently, a row stays if it belongs to any
# other portfolio, or if its pm or fm flag is non-zero.
data <- data[
    !(data$portfolio == "Taoiseach" | data$portfolio == "Finance") |
        data$pm != 0 |
        data$fm != 0,
]


# data frame to hold results: one row per distinct year found in `data`.
# The row count is taken from the sorted year vector itself (sort() drops
# NA), so every column is guaranteed to have the same length.
obs.years <- sort(unique(data$year))
N <- length(obs.years)
dstats <- data.frame(
    year=obs.years,
    n.obs=rep(NA,N),
    n.portfolios=rep(NA,N),
    doc.length=rep(NA,N))


# NUMBER OF PORTFOLIOS PER YEAR
# -------------------------------
# Cross-tabulate year by portfolio and turn the counts into 0/1 indicators,
# so that each row sum is the number of distinct portfolios in that year.
t2 <- with(data, table(year, portfolio))
t2[t2>0] <- 1
portfolios.per.year <- rowSums(t2)
print(portfolios.per.year)
# Average number of portfolios per year, rounded to a whole number.
# (The 0 is the `digits` argument of round(), not of print().)
print(round(mean(portfolios.per.year), 0))
dstats$n.portfolios <- portfolios.per.year



# NUMBER OF OBSERVATIONS PER YEAR
# -------------------------------
# Note: the number of observations in the regression models is the number of
# rows that remain after dropping the Taoiseach and Finance portfolios and
# keeping only debates after 1998.
d <- data[!(data$portfolio=="Taoiseach" | data$portfolio=="Finance"),]
d <- d[d$debate.year>1998,]
# Printed explicitly so the value is also shown when the script is source()d.
print(nrow(d))

# Number of unique speakers: keep the first row of every yearMemberID and
# tabulate those rows by year.
t1 <- table(data$year[!duplicated(data$yearMemberID)])

print(t1)
print(sum(t1))
print(round(mean(t1),0))
# Store plain counts (not a `table` object) in the results data frame and
# align them with dstats$year by name rather than by position.
dstats$n.obs <- as.vector(t1[as.character(dstats$year)])

# CALCULATE DOC LENGTH FOR SPEAKERS
# ---------------------------------
# For every debate year, take that year's word-frequency matrix from
# wfm.list (looked up by the year as a character key), keep the rows whose
# names match the memberIDs present in `data` for that year, and average
# the row totals.
years <- sort(unique(data$debate.year))

avg.length.by.year <- vapply(years, function(yr) {
    wfm <- wfm.list[[as.character(yr)]]
    speakers <- data$memberID[data$debate.year == yr]
    # drop = FALSE keeps a two-dimensional object even when exactly one
    # speaker matches; without it an ordinary matrix collapses to a vector
    # and rowSums() fails.
    mean(rowSums(wfm[rownames(wfm) %in% speakers, , drop = FALSE]))
}, numeric(1))

d <- data.frame(year=years, length=avg.length.by.year)

print(round(d, 0))
# Overall average document length, rounded to a whole number.
# (The 0 is the `digits` argument of round(), not of print().)
print(round(mean(d$length), 0))

# NOTE(review): `d` is indexed by debate.year while dstats is indexed by
# year; this assignment assumes both yield the same years in the same
# order -- TODO confirm.
dstats$doc.length <- round(d$length,0)


## # save df as csv file
## write.csv(dstats,
##           file="./tables/appendix_table1.1-data_overview.csv",
##           row.names = FALSE)

# PRINT TABLE 1.1 IN APPENDIX
print(dstats)


# INTRA- AND INTER-DEPARTMENT SPEAKER DISTANCE
# --------------------------------------------
# All summary values below are printed explicitly so that they are shown
# when the script is run through source() as well as interactively.

# Average distance between speakers from the same dept
# ----------------------------------------------------
# drop ref docs (Taoiseach and Finance portfolios)
d <- data[!(data$portfolio=="Taoiseach" | data$portfolio=="Finance"),]

# Largest and smallest position, and number of speakers, for every
# budget year x portfolio combination.
aggMax <- aggregate(d$ws.cab.rescaled,by=list(d$budget.year,d$portfolio), max)
names(aggMax) <- c("year","portfolio","max")

aggMin <- aggregate(d$ws.cab.rescaled,by=list(d$budget.year,d$portfolio), min)
names(aggMin) <- c("year","portfolio","min")

aggObs <- aggregate(rep(1,nrow(d)),by=list(d$budget.year,d$portfolio), sum)
names(aggObs) <- c("year","portfolio","obs")

# Combine the three summaries on their shared year/portfolio columns.
a <- merge(aggMax,aggMin)
a <- merge(a,aggObs)

print(table(a$obs))
print(sum(a$obs>1))
print(nrow(d))
print(round(sum(a$obs>1)/nrow(d)*100,0))

# In 39 cases there was more than one speaker from the same dept.
# With N=168 (excluding reference documents), this corresponds to 23%.

# Within-department distance: range (max - min) of the positions of the
# speakers from the same portfolio in the same year.
a$dist <- a$max - a$min

print(round(mean(a$dist[a$obs>1]),2))
print(round(sd(a$dist[a$obs>1]),2))

# The average distance between speakers from the same department
# across all years is 0.42 (sd=0.33)

# Average distance between speakers from different dept across all years,
# measured as the range (max - min) of all positions within a budget year.
aggMax <- aggregate(d$ws.cab.rescaled,by=list(d$budget.year), max)
names(aggMax) <- c("year","max")

aggMin <- aggregate(d$ws.cab.rescaled,by=list(d$budget.year), min)
names(aggMin) <- c("year","min")


a <- merge(aggMax,aggMin)

a$dist <- a$max - a$min

print(round(mean(a$dist),2))
print(round(sd(a$dist),2))

# The average distance between speakers from different departments in
# each debate is 1.01 (sd=0.31)



# PLOT CHANGES IN CAPITAL AND CURRENT BUDGET SHARES
# FOR OBSERVATIONS INCLUDED IN THE REGRESSION MODELS
d <- data[!(data$portfolio=="Taoiseach" | data$portfolio=="Finance") & data$debate.year>1998, c("change.capital.share", "change.current.share")]

# Rows without a capital-share change are dropped for both histograms.
d <- d[!is.na(d$change.capital.share),]

# Draw a 50-bin histogram of `values` and write it to the PDF at `file`.
#
# values: numeric vector to plot on the x axis.
# title:  plot title.
# breaks: x-axis tick positions.
# file:   path of the PDF to create (10 x 6 inches).
#
# Returns the ggplot object invisibly. ggplot2 functions are called through
# `ggplot2::` because the library() call at the top of this script is
# commented out, so the package is not necessarily attached.
save_share_histogram <- function(values, title, breaks, file) {
    p <- ggplot2::ggplot(data.frame(value = values),
                         ggplot2::aes(x = value)) +
        ggplot2::theme_bw() +
        ggplot2::geom_histogram(bins = 50) +
        ggplot2::scale_x_continuous(breaks = breaks) +
        ggplot2::labs(
            title = title,
            x = "Annual change in percentage points",
            y = "Count") +
        ggplot2::theme(text = ggplot2::element_text(size = 22))

    pdf(file, width = 10, height = 6)
    # Close the device even if rendering the plot fails.
    on.exit(dev.off(), add = TRUE)
    print(p)
    invisible(p)
}

p <- save_share_histogram(
    d$change.capital.share,
    title = "Change in capital share",
    breaks = seq(-2.5, 1.5, 0.5),
    file = "./plots/appendix_figure1.4-distribution_change_capital_share.pdf")

p <- save_share_histogram(
    d$change.current.share,
    title = "Change in current share",
    breaks = seq(-2, 6, 0.5),
    file = "./plots/appendix_figure1.4-distribution_change_current_share.pdf")

